package contentSpider;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Regex-driven {@link ContentCapture} implementation that pulls a title and a
 * content fragment out of a raw page body.
 *
 * <p>The instance is configured through three regular expressions:
 * <ul>
 *   <li>{@code titleRex} - capture group 2 of its first match becomes the title;</li>
 *   <li>{@code contentRex} - capture group 2 of its <em>last</em> match becomes the content;</li>
 *   <li>{@code delRex} - every case-insensitive match inside the content is deleted.</li>
 * </ul>
 *
 * <p>The extracted title is stored in a mutable field, so a single instance
 * carries state between calls to {@link #getContent(String)}.
 */
public class ContentCaptureBlog implements ContentCapture {

	/** Matches a '>' together with any whitespace that directly follows it. */
	private static final Pattern SPACE_AFTER_BRACKET = Pattern.compile(">\\s*");

	/** Matches a '<' together with any whitespace that directly precedes it. */
	private static final Pattern SPACE_BEFORE_BRACKET = Pattern.compile("\\s*<");

	Log logger = LogFactory.getLog(ContentCaptureBlog.class);

	/** Regex whose capture group 2 yields the content; must be set before use. */
	String contentRex = null;

	/** Regex whose capture group 2 yields the title; title extraction is skipped when unset. */
	String titleRex = null;

	/** Regex describing fragments to delete from the content; deletion is skipped when unset. */
	String delRex = null;

	/** Title found by the most recent successful title match, or null if none has matched yet. */
	String title = null;

	/**
	 * Captures the valid content portion of the given string.
	 *
	 * <p>As a side effect the title field is updated when {@code titleRex}
	 * matches, and the {@code LOG_DIR} system property is set to {@code "./"}.
	 *
	 * @param bodyString raw text to search; {@code null} yields an empty result
	 * @return the trimmed group 2 of the last {@code contentRex} match, with
	 *         {@code delRex} matches removed and whitespace adjacent to
	 *         {@code <} / {@code >} stripped; an empty string when nothing matched
	 * @throws NullPointerException if {@code contentRex} has not been set
	 */
	public String getContent(String bodyString) {
		if (bodyString == null) {
			logger.warn("getContent called with a null body; returning empty content");
			return "";
		}

		updateTitle(bodyString);

		// NOTE(review): global side effect kept from the original implementation;
		// presumably some logging configuration reads LOG_DIR - confirm before removing.
		System.setProperty("LOG_DIR", "./");

		if (contentRex == null) {
			// Same exception type as the original Pattern.compile(null) failure,
			// but with a message that points at the missing configuration.
			throw new NullPointerException(
					"contentRex must be set via setContentRex before calling getContent");
		}

		// Walk every match so that the last one wins, as before.
		String finalContent = null;
		Matcher contentMatcher = Pattern.compile(contentRex).matcher(bodyString);
		while (contentMatcher.find()) {
			String captured = contentMatcher.group(2);
			// group(2) is null when the group did not take part in this match.
			if (captured != null) {
				finalContent = captured.trim();
			}
		}

		if (finalContent == null) {
			// Previously this path crashed with a NullPointerException in matcher(null).
			logger.warn("contentRex did not match the body; returning empty content");
			return "";
		}

		String cleaned = finalContent;
		if (delRex != null) {
			cleaned = Pattern.compile(delRex, Pattern.CASE_INSENSITIVE)
					.matcher(cleaned)
					.replaceAll("");
		}

		cleaned = SPACE_AFTER_BRACKET.matcher(cleaned).replaceAll(">");
		return SPACE_BEFORE_BRACKET.matcher(cleaned).replaceAll("<");
	}

	/**
	 * Stores the trimmed group 2 of the first {@code titleRex} match in the title field.
	 * The field is left untouched when no pattern is configured or nothing matches,
	 * so it may still hold the value from an earlier call.
	 */
	private void updateTitle(String bodyString) {
		if (titleRex == null) {
			return;
		}
		Matcher titleMatcher = Pattern.compile(titleRex).matcher(bodyString);
		if (titleMatcher.find()) {
			String captured = titleMatcher.group(2);
			if (captured != null) {
				title = captured.trim();
			}
		}
	}

	/**
	 * @param contentRex regex with at least two capture groups; group 2 is the content
	 */
	public void setContentRex(String contentRex) {
		this.contentRex = contentRex;
	}

	/**
	 * @return the title stored by the most recent successful title match,
	 *         or {@code null} if no title has been matched yet
	 */
	public String getTitle() {
		return title;
	}

	/**
	 * @param titleRex regex with at least two capture groups; group 2 is the title
	 */
	public void setTitleRex(String titleRex) {
		this.titleRex = titleRex;
	}

	/**
	 * @param delRex regex (applied case-insensitively) whose matches are removed from the content
	 */
	public void setDelRex(String delRex) {
		this.delRex = delRex;
	}
}
